Collate larval size data

Author

Max Lindmark

Published

2024-11-27

# Load libraries
library(tidyverse)
library(tidyterra)
library(tidync)
library(ggridges)
library(readxl)
library(janitor)
library(lubridate)
library(sdmTMB)
library(ncdf4)
library(patchwork)
library(terra)
library(viridis)
library(devtools)
library(ggsidekick)
theme_set(theme_sleek())
library(crayon)
library(marmap)
library(tidylog)

# Point to wd
home <- here::here()

# Load all custom functions in R/function
# - map-plot [source_url("https://raw.githubusercontent.com/maxlindmark/cod-interactions/main/R/functions/map-plot.R")]
# - callCopernicusCovariate
# - extractCovariateAtLocation
for (fun in list.files(paste0(home, "/R/functions"))) {
  source(paste(home, "R/functions", fun, sep = "/"))
}

Explore data

Read and clean data

Old data

# 1992 -2008
length_old <- read_excel(paste0(home, "/data/larvea/1992_2010 MIK SWE Alla arter.xlsx"),
  sheet = 1,
  skip = 20
)

# Clean data!
length_old <- length_old |>
  mutate(species = fct_recode(species, "Chirolophis ascanii" = "Chirolophis  ascanii")) |>
  clean_names() |>
  pivot_longer(cols = x3:x130, names_to = "length_mm", values_to = "n") |>
  dplyr::select(year, day, month, haul, species, n, length_mm) |>
  mutate(length_mm = as.numeric(str_remove(length_mm, "x"))) |>
  # NA n means there was no recorded larvae in that size group (note there are also 0s in the data)
  drop_na(n) |>
  filter(n > 0) |>
  # expand the data so that 1 row is 1 individual
  # FIXME: the counts are not integers?! (hence the need for round)
  uncount(round(n)) |>
  dplyr::select(-n) |>
  # make a unique haul ID so that we can match with trawl data
  mutate(haul_id = paste(year, month, day, haul, sep = "_"))

# Read trawl data and match in coordinates
trawl_old <- read_excel(paste0(home, "/data/larvea/1992-2010 MIK SWE Tråldata.xlsx"),
  sheet = 1,
  skip = 8
) |>
  clean_names() |>
  # the last two coordinate columns are decimal degrees of haul position
  rename(
    haul = haul_no,
    lat = lat_decim_20,
    lon = long_decim_21
  ) |>
  # two rows without info, including year, so I'm dropping these
  drop_na(year) |>
  mutate(haul_id = paste(year, month, day, haul, sep = "_")) |>
  distinct(haul_id, .keep_all = TRUE) |>
  dplyr::select(haul_id, lat, lon, temp)

# Join trawl data to length data
length_old <- length_old |>
  left_join(trawl_old, by = "haul_id") |>
  dplyr::select(-haul) |>
  mutate(
    period = "old",
    day = as.numeric(day),
    month = as.numeric(month)
  )

New data

# 2008-2022
length_new <- read_excel(paste0(home, "/data/larvea/ELDB(s) bara fisk 2008-2024.xlsx")) |>
  clean_names() |>
  mutate(
    species = fct_recode(species, "chirolophis ascanii" = "chirolophis  ascanii"),
    species = str_to_sentence(species)
  ) |>
  rename(length_mm = length) |>
  dplyr::select(haul_id, species, length_mm, number) |>
  drop_na(length_mm) |>
  # expand the data frame with "number"
  uncount(round(number))

#trawl_new <- read_excel(paste0(home, "/data/larvea/ELDB 2008-2024.xlsx")) |>
trawl_new <- read_excel(paste0(home, "/data/larvea/ELDB 2008-2024 - rättade djup.xlsx")) |>
  clean_names() |>
  rename(
    lat = start_latitud,
    lon = start_longitud
  ) |>
  dplyr::select(year, day, month, haul_id, lat, lon, sur_temp) |>
  rename(temp = sur_temp)

length_new <- length_new |>
  left_join(trawl_new, by = "haul_id") |>
  mutate(
    period = "new",
    year = as.numeric(year),
    day = as.numeric(day),
    month = as.numeric(month)
  )

Join old and new

overlapping_yrs <- intersect(length_new$year, length_old$year)

length_old <- length_old |> filter(!year %in% c(overlapping_yrs))

d <- bind_rows(length_new, length_old) |>
  mutate(yday = yday(paste(year, month, day, sep = "-"))) |>
  filter(lon > 8) |>
  drop_na(lat) |>
  rename(temp_obs = temp)

# Add km UTM coords
d <- d |>
  add_utm_columns(ll_names = c("lon", "lat"))

sort(unique(d$species))
 [1] "Aequorea"                     "Agonus cataphractus"         
 [3] "Alloteuthis subulata"         "Ammodytes marinus"           
 [5] "Ammodytidae"                  "Anarhichas lupus"            
 [7] "Anguilla anguilla"            "Aphia minuta"                
 [9] "Argentina silus"              "Argentina sphyraena"         
[11] "Argentina spp"                "Argentinidae"                
[13] "Arnoglossus laterna"          "Branchiostoma lanceolatum"   
[15] "Buglossidium luteum"          "Callionymidae"               
[17] "Callionymus lyra"             "Callionymus maculatus"       
[19] "Callionymus reticulatus"      "Chirolophis ascanii"         
[21] "Clupea harengus"              "CLUPEA HARENGUS ADULT"       
[23] "Clupeidae"                    "Cottidae"                    
[25] "Cottus spp"                   "Crystallogobius linearis"    
[27] "Cyclopterus lumpus"           "Echiodon drummondi"          
[29] "Enchelyopus cimbrius"         "Engraulis encrasicholus"     
[31] "Engraulis encrasicolus"       "Entelurus aequoreus"         
[33] "Eutrigla gurnardus"           "Gadidae"                     
[35] "Gadus morhua"                 "Gaidropsarus argentatus"     
[37] "Gasterosteus aculeatus"       "Glyptocephalus cynoglossus"  
[39] "Gobiidae"                     "Hippoglossoides platessoides"
[41] "Hyperoplus lanceolatus"       "Illex"                       
[43] "Lebetus scorpioides"          "Limanda limanda"             
[45] "Liparis montagui"             "Loligo"                      
[47] "Lumpenus lampretaeformis"     "Maurolicus muelleri"         
[49] "Merlangius merlangus"         "Merluccius merluccius"       
[51] "Microstomus kitt"             "Molva molva"                 
[53] "Mugilidae"                    "Myoxocephalus scorpioides"   
[55] "Myoxocephalus scorpius"       "Nerophis lumbriciformis"     
[57] "Osmerus eperlanus"            "Pholis gunnellus"            
[59] "Phrynorhombus norvegicus"     "Phycidae"                    
[61] "Physidae"                     "Pleuronectes platessa"       
[63] "Pleuronectidae"               "Pleuronectiformes"           
[65] "Pollachius pollachius"        "Pomatoschistus sp"           
[67] "Sardina pilchardus"           "Sepiola atlantica"           
[69] "Solea solea"                  "Sprattus sprattus"           
[71] "SPRATTUS SPRATTUS ADULT"      "Syngnathus acus"             
[73] "Syngnathus rostellatus"       "Syngnathus spp"              
[75] "Syngnathus typhle"            "Taurulus bubalis"            
[77] "Trachurus trachurus"          "Triglidae"                   
[79] "Trisopterus esmarkii"         "UNIDENTIFIED"                
d <- d |> mutate(species = ifelse(species == "Ammodytes marinus", "Ammodytidae", species))

Explore data

# Sample size
plot_map_fc +
  geom_point(
    data = d |>
      group_by(haul_id, Y, X, year) |>
      summarise(n = n()),
    aes(X * 1000, Y * 1000, color = n),
    size = 0.5
  ) +
  facet_wrap(~year, ncol = 8) +
  scale_color_viridis(trans = "sqrt") +
  ggtitle("Sample size per haul")

# Day of the year
plot_map_fc +
  geom_point(
    data = d |>
      distinct(haul_id, .keep_all = TRUE),
    aes(X * 1000, Y * 1000, color = yday),
    size = 0.5
  ) +
  facet_wrap(~year, ncol = 8) +
  scale_color_viridis(trans = "sqrt") +
  ggtitle("Day of the year of sampling in space")

# Which dates are sampled?
d |>
  ggplot(aes(as.factor(month))) +
  scale_fill_viridis(discrete = TRUE) +
  geom_histogram(stat = "count")

d |>
  ggplot(aes(x = yday, y = as.factor(year), fill = after_stat(x))) +
  scale_fill_viridis(alpha = 0.8, name = "") +
  geom_density_ridges_gradient(alpha = 0.75) +
  theme_facet_map() +
  labs(y = "year")

# Species
sort(unique(d$species))
 [1] "Aequorea"                     "Agonus cataphractus"         
 [3] "Alloteuthis subulata"         "Ammodytidae"                 
 [5] "Anarhichas lupus"             "Anguilla anguilla"           
 [7] "Aphia minuta"                 "Argentina silus"             
 [9] "Argentina sphyraena"          "Argentina spp"               
[11] "Argentinidae"                 "Arnoglossus laterna"         
[13] "Branchiostoma lanceolatum"    "Buglossidium luteum"         
[15] "Callionymidae"                "Callionymus lyra"            
[17] "Callionymus maculatus"        "Callionymus reticulatus"     
[19] "Chirolophis ascanii"          "Clupea harengus"             
[21] "CLUPEA HARENGUS ADULT"        "Clupeidae"                   
[23] "Cottidae"                     "Cottus spp"                  
[25] "Crystallogobius linearis"     "Cyclopterus lumpus"          
[27] "Echiodon drummondi"           "Enchelyopus cimbrius"        
[29] "Engraulis encrasicholus"      "Engraulis encrasicolus"      
[31] "Entelurus aequoreus"          "Eutrigla gurnardus"          
[33] "Gadidae"                      "Gadus morhua"                
[35] "Gaidropsarus argentatus"      "Gasterosteus aculeatus"      
[37] "Glyptocephalus cynoglossus"   "Gobiidae"                    
[39] "Hippoglossoides platessoides" "Hyperoplus lanceolatus"      
[41] "Illex"                        "Lebetus scorpioides"         
[43] "Limanda limanda"              "Liparis montagui"            
[45] "Loligo"                       "Lumpenus lampretaeformis"    
[47] "Maurolicus muelleri"          "Merlangius merlangus"        
[49] "Merluccius merluccius"        "Microstomus kitt"            
[51] "Molva molva"                  "Mugilidae"                   
[53] "Myoxocephalus scorpioides"    "Myoxocephalus scorpius"      
[55] "Nerophis lumbriciformis"      "Osmerus eperlanus"           
[57] "Pholis gunnellus"             "Phrynorhombus norvegicus"    
[59] "Phycidae"                     "Physidae"                    
[61] "Pleuronectes platessa"        "Pleuronectidae"              
[63] "Pleuronectiformes"            "Pollachius pollachius"       
[65] "Pomatoschistus sp"            "Sardina pilchardus"          
[67] "Sepiola atlantica"            "Solea solea"                 
[69] "Sprattus sprattus"            "SPRATTUS SPRATTUS ADULT"     
[71] "Syngnathus acus"              "Syngnathus rostellatus"      
[73] "Syngnathus spp"               "Syngnathus typhle"           
[75] "Taurulus bubalis"             "Trachurus trachurus"         
[77] "Triglidae"                    "Trisopterus esmarkii"        
[79] "UNIDENTIFIED"                
# Clean species names!
d <- d |>
  mutate(species = str_to_sentence(species))

sort(unique(d$species))
 [1] "Aequorea"                     "Agonus cataphractus"         
 [3] "Alloteuthis subulata"         "Ammodytidae"                 
 [5] "Anarhichas lupus"             "Anguilla anguilla"           
 [7] "Aphia minuta"                 "Argentina silus"             
 [9] "Argentina sphyraena"          "Argentina spp"               
[11] "Argentinidae"                 "Arnoglossus laterna"         
[13] "Branchiostoma lanceolatum"    "Buglossidium luteum"         
[15] "Callionymidae"                "Callionymus lyra"            
[17] "Callionymus maculatus"        "Callionymus reticulatus"     
[19] "Chirolophis ascanii"          "Clupea harengus"             
[21] "Clupea harengus adult"        "Clupeidae"                   
[23] "Cottidae"                     "Cottus spp"                  
[25] "Crystallogobius linearis"     "Cyclopterus lumpus"          
[27] "Echiodon drummondi"           "Enchelyopus cimbrius"        
[29] "Engraulis encrasicholus"      "Engraulis encrasicolus"      
[31] "Entelurus aequoreus"          "Eutrigla gurnardus"          
[33] "Gadidae"                      "Gadus morhua"                
[35] "Gaidropsarus argentatus"      "Gasterosteus aculeatus"      
[37] "Glyptocephalus cynoglossus"   "Gobiidae"                    
[39] "Hippoglossoides platessoides" "Hyperoplus lanceolatus"      
[41] "Illex"                        "Lebetus scorpioides"         
[43] "Limanda limanda"              "Liparis montagui"            
[45] "Loligo"                       "Lumpenus lampretaeformis"    
[47] "Maurolicus muelleri"          "Merlangius merlangus"        
[49] "Merluccius merluccius"        "Microstomus kitt"            
[51] "Molva molva"                  "Mugilidae"                   
[53] "Myoxocephalus scorpioides"    "Myoxocephalus scorpius"      
[55] "Nerophis lumbriciformis"      "Osmerus eperlanus"           
[57] "Pholis gunnellus"             "Phrynorhombus norvegicus"    
[59] "Phycidae"                     "Physidae"                    
[61] "Pleuronectes platessa"        "Pleuronectidae"              
[63] "Pleuronectiformes"            "Pollachius pollachius"       
[65] "Pomatoschistus sp"            "Sardina pilchardus"          
[67] "Sepiola atlantica"            "Solea solea"                 
[69] "Sprattus sprattus"            "Sprattus sprattus adult"     
[71] "Syngnathus acus"              "Syngnathus rostellatus"      
[73] "Syngnathus spp"               "Syngnathus typhle"           
[75] "Taurulus bubalis"             "Trachurus trachurus"         
[77] "Triglidae"                    "Trisopterus esmarkii"        
[79] "Unidentified"                
d |>
  group_by(year, species) |>
  summarise(n = n()) |>
  ggplot(aes(year, n, fill = species)) +
  guides(fill = "none") +
  facet_wrap(~species, scales = "free_y") +
  geom_bar(stat = "identity")

# Filter species with at least 6 years of data and minimum 5 sizes per year and 10 samples
d <- d |>
  # At least 5 samples by year
  mutate(n = n(), .by = c(species, year)) |>
  filter(n >= 10) |>
  mutate(n_years = length(unique(year)), .by = species) |>
  filter(n_years >= 7) |>
  dplyr::select(-n_years, -n)

sort(unique(d$species))
 [1] "Agonus cataphractus"      "Ammodytidae"             
 [3] "Anguilla anguilla"        "Aphia minuta"            
 [5] "Chirolophis ascanii"      "Clupea harengus"         
 [7] "Crystallogobius linearis" "Microstomus kitt"        
 [9] "Myoxocephalus scorpius"   "Pholis gunnellus"        
[11] "Pomatoschistus sp"        "Sardina pilchardus"      
[13] "Sprattus sprattus"        "Sprattus sprattus adult" 
[15] "Syngnathus rostellatus"  

Trim data

Sizes are here transition to post-larvae (after which they metamorphose into juveniles). In some cases, the distribution of sizes overlap a lot with the post-larve size. In those cases we go for the size where they move from pelagic to benthic habitats (which often is what splits the size-distributions). Some species are also not larvae.

sort(unique(d$species))
 [1] "Agonus cataphractus"      "Ammodytidae"             
 [3] "Anguilla anguilla"        "Aphia minuta"            
 [5] "Chirolophis ascanii"      "Clupea harengus"         
 [7] "Crystallogobius linearis" "Microstomus kitt"        
 [9] "Myoxocephalus scorpius"   "Pholis gunnellus"        
[11] "Pomatoschistus sp"        "Sardina pilchardus"      
[13] "Sprattus sprattus"        "Sprattus sprattus adult" 
[15] "Syngnathus rostellatus"  
# Agonus cataphractus: Trim the 3 data points above postlarvale sizes of 20 mm
d <- d |> filter(!(species == "Agonus cataphractus" & length_mm > 20))
filter: removed 2 rows (<1%), 55,500 rows remaining
# Ammodytidae: Two species, don't foget to add ammodyteas into this group in the data processing script, since identification to species level likely has changed over time! There also appears to be two cohorts. Post-larvae cutoff at 26-30 mm cutoff at post-larvae for the two species, but here perhaps we want to split around 55 mm to separate the clusters?

d |>
  filter((species == "Ammodytidae" & length_mm < 57)) |>
  ggplot(aes(length_mm)) +
  geom_histogram()
filter: removed 54,234 rows (98%), 1,266 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

d <- d |> filter(!(species == "Ammodytidae" & length_mm > 58))
filter: removed 148 rows (<1%), 55,352 rows remaining
# Anguilla anguilla: Ok
# Aphia minuta: not larvae but can include anyway. Remove the one outlier
d |>
  filter(species == "Aphia minuta") |>
  ggplot(aes(length_mm)) +
  geom_histogram()
filter: removed 48,209 rows (87%), 7,143 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

d <- d |> filter(!(species == "Aphia minuta" & length_mm > 60))
filter: removed one row (<1%), 55,351 rows remaining
# Argentina silus: Merge with Argentina spp? Seems like identification to species level changed over time. Post larvae under 50 which seems fitting for a cutoff size
d |>
  filter(species %in% c("Argentina silus", "Argentina spp")) |>
  ggplot(aes(length_mm, fill = species)) +
  geom_histogram()
filter: removed all rows (100%)

d <- d |>
  filter(!(species %in% c("Argentina silus", "Argentina spp") & length_mm > 50)) |>
  mutate(species = ifelse(species == "Argentina silus", "Argentina spp", species))
filter: no rows removed
mutate: no changes
# Chirolophis ascanii: Post-larvae around 20 mm. However, data indicate a size for metamorphosis around 30 mm (since they then settle on the seabead).
# d |>
#   filter(species == "Chirolophis ascanii") |>
#   ggplot(aes(length_mm)) +
#   geom_histogram()
#
# d <- d |> filter(!(species == "Chirolophis ascanii" & length_mm > 30))

# Clupea harengus: Post-larve until 48-50 mm, seems fitting
d |>
  filter(species %in% c("Clupea harengus")) |>
  ggplot(aes(length_mm, fill = species)) +
  geom_histogram()
filter: removed 40,780 rows (74%), 14,571 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

d <- d |> filter(!(species == "Clupea harengus" & length_mm > 50))
filter: removed 6 rows (<1%), 55,345 rows remaining
# Crystallogobius linearis: Adult, otherwise seems OK. Malin was going to check something with the gear: are we sampling a different part of the size-distribution now?
# Enchelyopus cimbrius: Potentially different cohorts because spawning time is long and potentially not accurate in the literature given the sizes we observe here. <span style="color:red;">Drop this from the study</span>
d <- d |> filter(!(species == "Enchelyopus cimbrius"))
filter: no rows removed
# Limanda limanda: Post-larve may remain pelagic until 30 mm, use that as a cutoff.
d |>
  filter(species %in% c("Limanda limanda")) |>
  ggplot(aes(length_mm, fill = species)) +
  geom_histogram()
filter: removed all rows (100%)

d <- d |> filter(!(species == "Limanda limanda" & length_mm > 30))
filter: no rows removed
# Maurolicus muelleri: These are not larvae and not spawning frequently here. <span style="color:red;">Drop this from the study</span>
d <- d |> filter(!(species == "Maurolicus muelleri"))
filter: no rows removed
# Microstomus kitt: metamorphosis at 18 mm, but still pelagic? we judge that they are pelagic until 40 mm
d |>
  filter(species %in% c("Microstomus kitt")) |>
  ggplot(aes(length_mm, fill = species)) +
  geom_histogram()
filter: removed 54,711 rows (99%), 634 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

d <- d |> filter(!(species == "Microstomus kitt" & length_mm > 40))
filter: removed 5 rows (<1%), 55,340 rows remaining
# Myoxocephalus scorpius: Pelagic until about 20 mm, use that as a cutoff
d |>
  filter(species %in% c("Myoxocephalus scorpius")) |>
  ggplot(aes(length_mm, fill = species)) +
  geom_histogram()
filter: removed 54,793 rows (99%), 547 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

d <- d |> filter(!(species == "Myoxocephalus scorpius" & length_mm > 20))
filter: no rows removed
# Pholis gunnellus: pelagic until 35 mm, use that as a cutoff
d |>
  filter(species %in% c("Pholis gunnellus")) |>
  ggplot(aes(length_mm, fill = species)) +
  geom_histogram()
filter: removed 49,759 rows (90%), 5,581 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

d <- d |> filter(!(species == "Pholis gunnellus" & length_mm > 35))
filter: removed 14 rows (<1%), 55,326 rows remaining
# Pomatoschistus sp: Not larve, could keep for the same reason as other non-larvae species

# Sardina pilchardus: Ok
# Sprattus sprattus: Too many adults because they don't spawn at the right time. If we trim to postlarvae, then we have too few anyway. <span style="color:red;">Drop this from the study</span>

d <- d |> filter(!species == "Sprattus sprattus")
filter: removed 943 rows (2%), 54,383 rows remaining
d <- d |> filter(!species == "Sprattus sprattus adult")
filter: removed 1,372 rows (3%), 53,011 rows remaining
# Syngnathus rostellatus: No larval stage, can keep for the same reason as other non-larvae

# Taurulus bubalis: 12 mm cutoff (is that post-larva maximum size or start?)
d |>
  filter(species %in% c("Taurulus bubalis")) |>
  ggplot(aes(length_mm, fill = species)) +
  geom_histogram()
filter: removed all rows (100%)

d <- d |> filter(!(species == "Taurulus bubalis" & length_mm > 35))
filter: no rows removed
# Add in which species are not strictly larvae
d <- d |>
  mutate(life_stage = ifelse(species %in% c(
    "Ammodytidae", "Syngnathus rostellatus",
    "Pomatoschistus sp", "Crystallogobius linearis",
    "Aphia minuta"
  ),
  "mixed", "larvae"
  ))
mutate: new variable 'life_stage' (character) with 2 unique values and 0% NA
# Plot species samples in space, color by year
plot_map_fc +
  geom_point(
    data = d,
    aes(X * 1000, Y * 1000, color = year),
    size = 0.5
  ) +
  facet_wrap(~species, ncol = 6) +
  scale_color_viridis() +
  ggtitle("Samples in space by species and year")

d |>
  group_by(year, species) |>
  summarise(n = n()) |>
  ggplot(aes(year, n, fill = species)) +
  guides(fill = "none") +
  facet_wrap(~species, scales = "free_y", ncol = 5) +
  geom_bar(stat = "identity") +
  scale_fill_viridis(discrete = TRUE)
group_by: 2 grouping variables (year, species)
summarise: now 315 rows and 3 columns, one group variable remaining (year)

d |>
  distinct(species, period) |>
  as.data.frame()
distinct: removed 52,986 rows (>99%), 25 rows remaining
                    species period
1              Aphia minuta    new
2           Clupea harengus    new
3  Crystallogobius linearis    new
4          Microstomus kitt    new
5    Syngnathus rostellatus    new
6               Ammodytidae    new
7         Pomatoschistus sp    new
8          Pholis gunnellus    new
9       Chirolophis ascanii    new
10   Myoxocephalus scorpius    new
11      Agonus cataphractus    new
12        Anguilla anguilla    new
13       Sardina pilchardus    new
14          Clupea harengus    old
15 Crystallogobius linearis    old
16             Aphia minuta    old
17      Chirolophis ascanii    old
18        Pomatoschistus sp    old
19              Ammodytidae    old
20        Anguilla anguilla    old
21         Pholis gunnellus    old
22      Agonus cataphractus    old
23         Microstomus kitt    old
24   Syngnathus rostellatus    old
25   Myoxocephalus scorpius    old

Add covariate to hauls

# Specify covariates path for simplicity
covPath <- paste0(home, "/data/covariates")

Satellite derived temperatures

https://data.marine.copernicus.eu/product/SST_BAL_SST_L4_REP_OBSERVATIONS_010_016/description

## Load satellite derived SST.
# Source: https://data.marine.copernicus.eu/product/SST_BAL_SST_L4_REP_OBSERVATIONS_010_016/download
# Print details
print(nc_open(paste(covPath, "sst", "DMI_BAL_SST_L4_REP_OBSERVATIONS_010_016_1711802008633.nc", sep = "/")))
File /Users/maxlindmark/Dropbox/Max work/R/larval-sizes/data/covariates/sst/DMI_BAL_SST_L4_REP_OBSERVATIONS_010_016_1711802008633.nc (NC_FORMAT_NETCDF4):

     1 variables (excluding dimension variables):
        float analysed_sst[longitude,latitude,time]   (Contiguous storage)  
            units: kelvin
            _FillValue: NaN
            standard_name: sea_surface_foundation_temperature
            long_name: Analysed sea surface temperature

     3 dimensions:
        latitude  Size:355 
            standard_name: latitude
            long_name: Latitude
            units: degrees_north
            unit_long: Degrees North
            axis: Y
            valid_min: 48
            valid_max: 66
        longitude  Size:342 
            standard_name: longitude
            long_name: Longitude
            units: degrees_east
            unit_long: Degrees East
            axis: X
        time  Size:1462 
            standard_name: time
            long_name: Time
            units: seconds since 1970-01-01 00:00:00
            calendar: gregorian
            axis: T

    11 global attributes:
        Conventions: CF-1.11
        title: Baltic Sea SST analysis, daily reprocessed level 4 analysis
        institution: Danish Meteorological Institute, DMI
        source: ESA SST CCI, C3S and  CMEMS FMI and SMHI Sea ice concentration
        history: Version 1.0
        references: Høyer, J. L. and She, J., Optimal interpolation of sea surface temperature for the North Sea and Baltic Sea,  J. Mar. Sys., Vol 65, 1-4, pp. 176-189, 2007, Høyer, J. L., Le Borgne, P., & Eastwood, S. (2014). A bias correction method for Arctic satellite sea surface temperature observations. Remote Sensing of Environment, 146, 201-213.
        comment: IN NO EVENT SHALL DMI OR ITS REPRESENTATIVES BE LIABLE FOR ANY DAMAGES WHATSOEVER INCLUDING, WITHOUT LIMITATION, SPECIAL, INDIRECT, INCIDENTAL OR CONSEQUENTIAL DAMAGES OR DAMAGES FOR LOSS OF BUSINESS PROFITS OR SAVINGS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION OR OTHER PECUNIARY LOSS ARISING OUT OF THE USE OF OR THE BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION OR OTHER PECUNIARY LOSS ARISING OUT OF THE USE OF OR THE INABILITY TO USE THIS DMI PRODUCT, EVEN IF DMI HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION SHALL APPLY TO CLAIMS OF PERSONAL INJURY TO THE EXTENT PERMITTED BY LAW. SOME COUNTRIES OR STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF LIABILITY FOR CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL DAMAGES AND, ACCORDINGLY, SOME PORTIONS OF THESE LIMITATIONS MAY NOT APPLY TO YOU. BY USING THIS PRODUCT, YOU HAVE ACCEPTED THAT THE ABOVE LIMITATIONS OR THE MAXIMUM LEGALLY APPLICABLE SUBSET OF THESE LIMITATIONS APPLY TO YOUR USE OF THIS PRODUCT. WARNING Some applications are unable to properly handle signed bytevalues. If values are encountered > 127, please subtract 256 from this reported value
        subset:source: ARCO data downloaded from the Marine Data Store using the MyOcean Data Portal
        subset:productId: SST_BAL_SST_L4_REP_OBSERVATIONS_010_016
        subset:datasetId: DMI_BAL_SST_L4_REP_OBSERVATIONS_010_016_202012
        subset:date: 2024-03-30T12:33:28.633Z
# Load and gather the temperature data in a tibble
temp_tibble <- callCopernicusCovariate("sst", messages = 1)
Processing - Gathering data from df 1/8. 
mutate: new variable 'date' (double) with 1,462 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 5 unique values and 0% NA
filter: removed 56,462,847 rows (91%), 5,278,875 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 211,155 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables
Processing - Gathering data from df 2/8. 
mutate: new variable 'date' (double) with 1,462 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 5 unique values and 0% NA
filter: removed 56,462,847 rows (91%), 5,278,875 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 211,155 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables
Processing - Gathering data from df 3/8. 
mutate: new variable 'date' (double) with 1,462 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 5 unique values and 0% NA
filter: removed 56,462,847 rows (91%), 5,278,875 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 211,155 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables
Processing - Gathering data from df 4/8. 
mutate: new variable 'date' (double) with 1,462 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 5 unique values and 0% NA
filter: removed 56,462,847 rows (91%), 5,278,875 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 211,155 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables
Processing - Gathering data from df 5/8. 
mutate: new variable 'date' (double) with 1,462 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 5 unique values and 0% NA
filter: removed 56,462,847 rows (91%), 5,278,875 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 211,155 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables
Processing - Gathering data from df 6/8. 
mutate: new variable 'date' (double) with 1,462 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 5 unique values and 0% NA
filter: removed 56,462,847 rows (91%), 5,278,875 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 211,155 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables
Processing - Gathering data from df 7/8. 
mutate: new variable 'date' (double) with 1,462 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 5 unique values and 0% NA
filter: removed 56,462,847 rows (91%), 5,278,875 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 211,155 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables
Processing - Gathering data from df 8/8. 
mutate: new variable 'date' (double) with 1,521 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 5 unique values and 0% NA
filter: removed 69,342,258 rows (90%), 7,868,265 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 253,815 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables


Note - The data were filtered internally for January month

Completed
# Visualize temperature frequency distribution
hist(temp_tibble$sst)

# Visualize temperature spatial distribution
# plot_map +
#   geom_point(data = temp_tibble,
#              aes(X*1000, Y*1000, color = sst))

# Obtain temporal availability, this will be the temporal window to filter the data
unique(temp_tibble$year)
 [1] 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005
[16] 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
[31] 2021 2022 2023
# Trim years we have temperature for (again, annoying! Fix the temperatures later)
d <- d |>
  filter(year %in% unique(temp_tibble$year))
filter: removed 1,082 rows (2%), 51,929 rows remaining
# Loop through all year combos, extract the temperatures at the data locations
d <- extractCovariateAtLocation(
  "sst", # Name of the covariate to extract. One of: sst, chlorophyll, depth.
  d, # A df containing the set of yearand locations to be evaluated.
  temp_tibble, # A df containing the covariate at location
  changesYearly = 1, # Is the covariate time variant (e.g. temp) or not (e.g. depth)
  "temp", # Name to give to the covariate evaluated at location in the df
  messages = 1 # dichotomous
)
Processing - Gathering covariate information at location for year 1/31:1992. 
filter: removed 46,717 rows (90%), 5,212 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 2/31:1993. 
filter: removed 49,549 rows (95%), 2,380 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 3/31:1994. 
filter: removed 50,279 rows (97%), 1,650 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 4/31:1995. 
filter: removed 50,326 rows (97%), 1,603 rows remaining
filter: removed 1,647,438 rows (95%), 84,462 rows remaining
Processing - Gathering covariate information at location for year 5/31:1996. 
filter: removed 51,685 rows (>99%), 244 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 6/31:1997. 
filter: removed 51,495 rows (99%), 434 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 7/31:1998. 
filter: removed 50,288 rows (97%), 1,641 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 8/31:1999. 
filter: removed 49,237 rows (95%), 2,692 rows remaining
filter: removed 1,647,438 rows (95%), 84,462 rows remaining
Processing - Gathering covariate information at location for year 9/31:2000. 
filter: removed 50,777 rows (98%), 1,152 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 10/31:2001. 
filter: removed 50,620 rows (97%), 1,309 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 11/31:2002. 
filter: removed 50,173 rows (97%), 1,756 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 12/31:2003. 
filter: removed 51,152 rows (99%), 777 rows remaining
filter: removed 1,647,438 rows (95%), 84,462 rows remaining
Processing - Gathering covariate information at location for year 13/31:2004. 
filter: removed 50,648 rows (98%), 1,281 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 14/31:2005. 
filter: removed 49,933 rows (96%), 1,996 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 15/31:2006. 
filter: removed 50,758 rows (98%), 1,171 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 16/31:2007. 
filter: removed 49,833 rows (96%), 2,096 rows remaining
filter: removed 1,647,438 rows (95%), 84,462 rows remaining
Processing - Gathering covariate information at location for year 17/31:2008. 
filter: removed 50,915 rows (98%), 1,014 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 18/31:2009. 
filter: removed 49,379 rows (95%), 2,550 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 19/31:2010. 
filter: removed 51,564 rows (99%), 365 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 20/31:2012. 
filter: removed 49,749 rows (96%), 2,180 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 21/31:2013. 
filter: removed 49,679 rows (96%), 2,250 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 22/31:2014. 
filter: removed 49,842 rows (96%), 2,087 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 23/31:2015. 
filter: removed 50,794 rows (98%), 1,135 rows remaining
filter: removed 1,647,438 rows (95%), 84,462 rows remaining
Processing - Gathering covariate information at location for year 24/31:2016. 
filter: removed 51,009 rows (98%), 920 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 25/31:2017. 
filter: removed 50,303 rows (97%), 1,626 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 26/31:2018. 
filter: removed 50,314 rows (97%), 1,615 rows remaining
filter: removed 1,689,669 rows (98%), 42,231 rows remaining
Processing - Gathering covariate information at location for year 27/31:2019. 
filter: removed 50,132 rows (97%), 1,797 rows remaining
filter: removed 1,638,906 rows (95%), 92,994 rows remaining
Processing - Gathering covariate information at location for year 28/31:2020. 
filter: removed 49,876 rows (96%), 2,053 rows remaining
filter: removed 1,681,137 rows (97%), 50,763 rows remaining
Processing - Gathering covariate information at location for year 29/31:2021. 
filter: removed 50,105 rows (96%), 1,824 rows remaining
filter: removed 1,681,137 rows (97%), 50,763 rows remaining
Processing - Gathering covariate information at location for year 30/31:2022. 
filter: removed 50,346 rows (97%), 1,583 rows remaining
filter: removed 1,681,137 rows (97%), 50,763 rows remaining
Processing - Gathering covariate information at location for year 31/31:2023. 
filter: removed 50,393 rows (97%), 1,536 rows remaining
filter: removed 1,681,137 rows (97%), 50,763 rows remaining


Completed

Satellite derived chlorophyll abundance

https://data.marine.copernicus.eu/product/GLOBAL_MULTIYEAR_BGC_001_029/description

## Load satellite derived chlorophyll
# Source: https://data.marine.copernicus.eu/product/GLOBAL_MULTIYEAR_BGC_001_029/download
# Print details
print(nc_open(paste(covPath, "chlorophyll", "cmems_mod_glo_bgc_my_0.25_P1D-m_1713795613611_01012017_12312022.nc", sep = "/")))
File /Users/maxlindmark/Dropbox/Max work/R/larval-sizes/data/covariates/chlorophyll/cmems_mod_glo_bgc_my_0.25_P1D-m_1713795613611_01012017_12312022.nc (NC_FORMAT_NETCDF4):

     6 variables (excluding dimension variables):
        float chl[longitude,latitude,depth,time]   (Contiguous storage)  
            units: mg m-3
            _FillValue: NaN
            standard_name: mass_concentration_of_chlorophyll_a_in_sea_water
            long_name: Total Chlorophyll
        float no3[longitude,latitude,depth,time]   (Contiguous storage)  
            units: mmol m-3
            _FillValue: NaN
            standard_name: mole_concentration_of_nitrate_in_sea_water
            long_name: Nitrate
        float nppv[longitude,latitude,depth,time]   (Contiguous storage)  
            units: mg m-3 day-1
            _FillValue: NaN
            standard_name: net_primary_production_of_biomass_expressed_as_carbon_per_unit_volume_in_sea_water
            long_name: Total Primary Production of Phyto
        float o2[longitude,latitude,depth,time]   (Contiguous storage)  
            units: mmol m-3
            _FillValue: NaN
            standard_name: mole_concentration_of_dissolved_molecular_oxygen_in_sea_water
            long_name: Dissolved Oxygen
        float po4[longitude,latitude,depth,time]   (Contiguous storage)  
            units: mmol m-3
            _FillValue: NaN
            standard_name: mole_concentration_of_phosphate_in_sea_water
            long_name: Phosphate
        float si[longitude,latitude,depth,time]   (Contiguous storage)  
            units: mmol m-3
            _FillValue: NaN
            standard_name: mole_concentration_of_silicate_in_sea_water
            long_name: Dissolved Silicate

     4 dimensions:
        depth  Size:1 
            standard_name: depth
            long_name: Depth
            units: m
            unit_long: Meters
            axis: Z
            positive: down
            valid_min: 0.505760014057159
            valid_max: 5902.05810546875
        latitude  Size:13 
            standard_name: latitude
            long_name: Latitude
            units: degrees_north
            unit_long: Degrees North
            axis: Y
            valid_min: -80
            valid_max: 90
        longitude  Size:21 
            standard_name: longitude
            long_name: Longitude
            units: degrees_east
            unit_long: Degrees East
            axis: X
        time  Size:2191 
            standard_name: time
            long_name: Time
            units: seconds since 1970-01-01 00:00:00
            calendar: gregorian
            axis: T

    12 global attributes:
        Conventions: CF-1.11
        title: Daily mean fields for product GLOBAL_REANALYSIS_BIO_001_029
        institution: Mercator Ocean
        producer: CMEMS - Global Monitoring and Forecasting Centre
        source: MERCATOR FREEBIORYS2V4
        credit: E.U. Copernicus Marine Service Information (CMEMS)
        contact: servicedesk.cmems@mercator-ocean.eu
        references: http://marine.copernicus.eu
        subset:source: ARCO data downloaded from the Marine Data Store using the MyOcean Data Portal
        subset:productId: GLOBAL_MULTIYEAR_BGC_001_029
        subset:datasetId: cmems_mod_glo_bgc_my_0.25_P1D-m_202112
        subset:date: 2024-04-22T14:20:13.611Z
# Load and gather the temperature data in a tibble
chl_tibble <- callCopernicusCovariate("chlorophyll", messages = 1)
Processing - Gathering data from df 1/4. 
mutate: new variable 'date' (double) with 2,191 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 6 unique values and 0% NA
filter: removed 276,690 rows (92%), 25,668 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 828 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables
Processing - Gathering data from df 2/4. 
mutate: new variable 'date' (double) with 2,922 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 8 unique values and 0% NA
filter: removed 369,012 rows (92%), 34,224 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 1,104 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables
Processing - Gathering data from df 3/4. 
mutate: new variable 'date' (double) with 3,288 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 9 unique values and 0% NA
filter: removed 415,242 rows (92%), 38,502 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 1,242 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables
Processing - Gathering data from df 4/4. 
mutate: new variable 'date' (double) with 2,556 unique values and 0% NA
mutate: new variable 'month' (double) with 12 unique values and 0% NA
        new variable 'day' (integer) with 31 unique values and 0% NA
        new variable 'year' (double) with 7 unique values and 0% NA
filter: removed 322,782 rows (92%), 29,946 rows remaining
group_by: 3 grouping variables (year, longitude, latitude)
summarise: now 966 rows and 4 columns, 2 group variables remaining (year, longitude)
ungroup: no grouping variables


Note - The data were filtered internally for January month

Completed
# Visualize chlorophyll frequency distribution
hist(chl_tibble$chl)

# Visualize chlorophyll spatial distribution
# plot_map +
#   geom_point(data = chl_tibble,
#              aes(X*1000, Y*1000, color = chl))

# Obtain temporal availability, this will be the temporal window to filter the data
sort(unique(chl_tibble$year))
 [1] 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007
[16] 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
# Trim years we have chlorophyll for.
d <- d |>
  filter(year %in% unique(chl_tibble$year)) # We loose 13% of the data by including chl.
filter: removed 6,748 rows (13%), 45,181 rows remaining
# Loop through all year combos, extract the chl at the data locations
d <- extractCovariateAtLocation(
  "chl",
  d,
  chl_tibble,
  changesYearly = 1,
  "chl",
  messages = 1
)
Processing - Gathering covariate information at location for year 1/29:1993. 
filter: removed 42,801 rows (95%), 2,380 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 2/29:1994. 
filter: removed 43,531 rows (96%), 1,650 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 3/29:1995. 
filter: removed 43,578 rows (96%), 1,603 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 4/29:1996. 
filter: removed 44,937 rows (99%), 244 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 5/29:1997. 
filter: removed 44,747 rows (99%), 434 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 6/29:1998. 
filter: removed 43,540 rows (96%), 1,641 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 7/29:1999. 
filter: removed 42,489 rows (94%), 2,692 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 8/29:2000. 
filter: removed 44,029 rows (97%), 1,152 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 9/29:2001. 
filter: removed 43,872 rows (97%), 1,309 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 10/29:2002. 
filter: removed 43,425 rows (96%), 1,756 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 11/29:2003. 
filter: removed 44,404 rows (98%), 777 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 12/29:2004. 
filter: removed 43,900 rows (97%), 1,281 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 13/29:2005. 
filter: removed 43,185 rows (96%), 1,996 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 14/29:2006. 
filter: removed 44,010 rows (97%), 1,171 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 15/29:2007. 
filter: removed 43,085 rows (95%), 2,096 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 16/29:2008. 
filter: removed 44,167 rows (98%), 1,014 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 17/29:2009. 
filter: removed 42,631 rows (94%), 2,550 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 18/29:2010. 
filter: removed 44,816 rows (99%), 365 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 19/29:2012. 
filter: removed 43,001 rows (95%), 2,180 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 20/29:2013. 
filter: removed 42,931 rows (95%), 2,250 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 21/29:2014. 
filter: removed 43,094 rows (95%), 2,087 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 22/29:2015. 
filter: removed 44,046 rows (97%), 1,135 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 23/29:2016. 
filter: removed 44,261 rows (98%), 920 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 24/29:2017. 
filter: removed 43,555 rows (96%), 1,626 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 25/29:2018. 
filter: removed 43,566 rows (96%), 1,615 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 26/29:2019. 
filter: removed 43,384 rows (96%), 1,797 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 27/29:2020. 
filter: removed 43,128 rows (95%), 2,053 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 28/29:2021. 
filter: removed 43,357 rows (96%), 1,824 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining
Processing - Gathering covariate information at location for year 29/29:2022. 
filter: removed 43,598 rows (96%), 1,583 rows remaining
filter: removed 4,002 rows (97%), 138 rows remaining


Completed

Satellite derived depth

https://data.marine.copernicus.eu/product/GLOBAL_MULTIYEAR_BGC_001_029/description

# Generate a depth box containing the bathymetries.
depth_box <- getNOAA.bathy(min(d$lon) - .1, max(d$lon) + .1, min(d$lat) - .1, max(d$lat) + .1)
Querying NOAA database ...
This may take seconds to minutes, depending on grid size
Building bathy matrix ...
# Obtain depth at locations.
d <- cbind(
  d,
  get.depth(depth_box, x = d$lon, y = d$lat, locator = F)["depth"]
)

## Convert to strictly positive values.
d$depth <- d$depth * (-1)

# Check
plot_map +
  geom_point(
    data = d,
    aes(X * 1000, Y * 1000, color = depth)
  )

d <- d |> tidylog::filter(depth > 0)
filter: removed 94 rows (<1%), 45,087 rows remaining

Check covariates

# Get the proportion of observations not assigned with a covariate value at prior steps
colMeans(is.na(d))
     haul_id      species    length_mm       number         year          day 
0.0000000000 0.0000000000 0.0000000000 0.4898973096 0.0000000000 0.0000000000 
       month          lat          lon     temp_obs       period         yday 
0.0000000000 0.0000000000 0.0000000000 0.5696542241 0.0000000000 0.0000000000 
           X            Y   life_stage         temp          chl        depth 
0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0001108967 0.0000000000 

Plot response variables

d |>
  summarise(n = n(), .by = species) |>
  arrange(desc(n))
                    species     n
1  Crystallogobius linearis 14136
2           Clupea harengus 11580
3              Aphia minuta  6328
4          Pholis gunnellus  5081
5         Pomatoschistus sp  1922
6               Ammodytidae  1157
7        Sardina pilchardus  1080
8       Chirolophis ascanii  1054
9    Syngnathus rostellatus  1043
10         Microstomus kitt   609
11   Myoxocephalus scorpius   529
12      Agonus cataphractus   339
13        Anguilla anguilla   229
# Distribution of data
ggplot(d, aes(length_mm)) +
  geom_histogram() +
  facet_wrap(~species, scales = "free")

# Effect of day of the year
ggplot(d, aes(yday, length_mm)) +
  geom_point(size = 0.4, alpha = 0.4) +
  geom_smooth(method = "lm") +
  facet_wrap(~species, scales = "free")

# Effect of year
ggplot(d, aes(year, length_mm)) +
  geom_point(size = 0.4, alpha = 0.4) +
  geom_smooth(method = "lm") +
  facet_wrap(~species, scales = "free")

# Effect of temperature
ggplot(d, aes(temp, length_mm)) +
  geom_point(size = 0.4, alpha = 0.4) +
  geom_smooth(method = "lm") +
  # geom_smooth() +
  facet_wrap(~species, scales = "free")

# Effect of chlorophyll
ggplot(d, aes(chl, length_mm)) +
  geom_point(size = 0.4, alpha = 0.4) +
  geom_smooth(method = "lm") +
  # geom_smooth() +
  facet_wrap(~species, scales = "free")

Save data

d <- d |> drop_na(temp)
drop_na: no rows removed
write_csv(d, paste0(home, "/data/clean/larval_size.csv"))